Code
library(magrittr)First, simulate genotype data. Then, for each subject’s genotype, simulate replicate traits.
library(magrittr)set.seed(2023-04-15)
n_tr <- 10000
n_te <- 100
n <- n_tr + n_te
# n is total number of subjects
n_reps <- 50
# n_reps is number of replicates
n_snp <- 1
# n_snp is number of SNPs
# simulate genotypes matrix
geno <- matrix(sample(c(0,1,2), n*n_snp, replace=TRUE, prob=c(0.25,0.5,0.25)), nrow=n, ncol=n_snp)
# simulate phenotype
beta <- 1
y <- as.numeric(geno %*% beta) %*% t(rep(1, n_reps)) +
matrix(data = rnorm(n * n_reps), nrow = n, ncol = n_reps)
# prepare for splitting into training and testing
# get test subject ids
test_ids <- sample(1:n, n_te, replace = FALSE)
# organize data
dat <- tibble::as_tibble(y) %>%
dplyr::rename_with(function(x){ num <- stringr::str_extract(x, "[0-9]+")
return(paste0("pheno", num))}
) %>%
dplyr::bind_cols(geno %>% tibble::as_tibble() %>% dplyr::rename(geno = 1)) %>%
dplyr::mutate(id = 1:n) %>% # fix this when using more than one SNP
dplyr::mutate(in_test_set = id %in% test_ids)Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
`.name_repair` is omitted as of tibble 2.0.0.
ℹ Using compatibility `.name_repair`.
# split into training and testing
training <- dat %>% dplyr::filter(!in_test_set)
testing <- dat %>% dplyr::filter(in_test_set)
testing2 <- testing %>% dplyr::mutate(fold = as.integer(NA))
# use all training with leave K out
alpha <- 0.1Above, we specified the number of replicates for the simulations. We created 50 replicate traits for the same 1.01^{4} subjects. Note that each subject has only 1 SNP(s).
import numpy as np
import pandas as pd
def leastsq_minL2(X,Y,X1,tol=1e-8):
uX,dX,vX = np.linalg.svd(X)
rX = (dX>=dX[0]*tol).sum()
betahat = (vX[:rX].T/dX[:rX]).dot(uX[:,:rX].T.dot(Y))
return X1.dot(betahat)
def compute_PIs(X,Y,X1,alpha = 0.1, K=10, fit_muh_fun = leastsq_minL2):
n = len(Y)
#n1 = X1.shape[0]
n1 = len(X1)
###############################
# CV+
###############################
n_K = np.floor(n/K).astype(int)
base_inds_to_delete = np.arange(n_K).astype(int)
resids_LKO = np.zeros(n)
muh_LKO_vals_testpoint = np.zeros((n,n1))
for i in range(K):
inds_to_delete = (base_inds_to_delete + n_K*i).astype(int)
X0 = np.delete(X, inds_to_delete)
X0_2d = X0[:, np.newaxis]
Y0 = np.delete(Y, inds_to_delete)
X0d = X[inds_to_delete]
X0d_2d = X0d[:, np.newaxis]
X1_2d = X1[:, np.newaxis]
muh_vals_LKO = fit_muh_fun(X0_2d, Y0, np.r_[X0d_2d, X1_2d])
resids_LKO[inds_to_delete] = np.abs(Y[inds_to_delete] - muh_vals_LKO[:n_K, 0])
for inner_K in range(n_K):
muh_LKO_vals_testpoint[inds_to_delete[inner_K]] = muh_vals_LKO[n_K:, 0]
ind_Kq = (np.ceil((1-alpha)*(n+1))).astype(int)
PIs_dict = {'CV+' : pd.DataFrame(\
np.c_[np.sort(muh_LKO_vals_testpoint.T - resids_LKO,axis=1).T[-ind_Kq], \
np.sort(muh_LKO_vals_testpoint.T + resids_LKO,axis=1).T[ind_Kq-1]],\
columns = ['lower','upper'])}
return pd.concat(PIs_dict.values(), axis=1, keys=PIs_dict.keys())np.random.seed(98765)
# simulation
n_vals = np.array([100, 1000, 10000]) # training set size
n1 = 100 # test set size
#SNR = 10
ntrial = 50
alpha = 0.1
#dim_vals = np.arange(5,205,5)
d = 1
#beta = np.random.normal(size=d)
#beta = beta/np.sqrt((beta**2).sum()) * np.sqrt(SNR)
beta = np.array([1])
#X1 = np.random.normal(size=(n1,d))
vector = [0, 1, 2]
X1 = np.random.choice(vector, size=(n1, d), replace=True)
Y1 = X1.dot(beta) + np.random.normal(size=n1)
method = 'CV+'
# define new objects for storing Y and X
Y_dic = dict.fromkeys(n_vals, 0)
results = pd.DataFrame(columns = ['itrial','d','method','coverage','width', 'n'])
# loop over n_vals
for n in n_vals:
Ya = np.zeros((n, ntrial))
X = np.random.choice(vector, size=(n, d), replace=True)
for itrial in range(ntrial):
Y = X.dot(beta) + np.random.normal(size=n)
Ya[:, itrial] = Y
# store X, Y for later use with R code
# store as numpy arrays
PIs = compute_PIs(X = X, Y = Y, X1 = X1)
coverage = ((PIs[method]['lower'] <= Y1)&(PIs[method]['upper'] >= Y1)).mean()
width = (PIs[method]['upper'] - PIs[method]['lower']).mean()
results.loc[len(results)]=[itrial,d,method,coverage,width,n]
results.to_csv('python-results.csv')library(reticulate)
saveRDS(py$results, file = "python-results.rds")
library(ggplot2)
p <- py$results %>%
ggplot() + geom_boxplot(aes(x = as.factor(n), y = width, fill = as.factor(n)))
plotly::ggplotly(p)res_list <- list()
vars_list_list_list <- list()
n_folds_vec <- c(5, 10, 20, 50)
for (n_folds in n_folds_vec){
# partition training data into K folds
folds <- split(training$id, sample(rep(1:n_folds, length.out = n_tr)))
training2_pre <- training %>%
dplyr::mutate(fold = id %>% purrr::map_int(~which(sapply(folds, function(x) . %in% x)))) %>%
dplyr::arrange(fold)
tictoc::tic() # timing
tl <- list()
n_per_fold_vec <- c(n_tr / n_folds, n_tr / (n_folds * 10))
vars_list_list <- list()
for (n_per_fold in n_per_fold_vec){
training2 <- training2_pre %>%
dplyr::group_by(fold) %>%
dplyr::slice_sample(n = n_per_fold) %>%
dplyr::ungroup()
# store each trait's outputs
out <- list()
vars_list <- list()
# loop over traits
for (trait_num in 1:n_reps){
tr2_one_trait <- training2 %>%
dplyr::select(id, fold, geno, tidyselect::ends_with(paste0("pheno", trait_num))) %>%
dplyr::rename(pheno = tidyselect::ends_with(paste0("pheno", trait_num)))
te2_one_trait <- testing2 %>%
dplyr::select(id, fold, geno, tidyselect::ends_with(paste0("pheno", trait_num))) %>%
dplyr::rename(pheno = tidyselect::ends_with(paste0("pheno", trait_num)))
# regress leaving one fold out
preds <- list()
vars <- list()
for (fold_num in 1:n_folds) {
# get training data
train <- tr2_one_trait %>% dplyr::filter(fold != fold_num)
# get testing data
test <- tr2_one_trait %>% dplyr::filter(fold == fold_num)
# fit model
fit <- lm(pheno ~ 1 + geno, data = train)
# predict
foo <- test %>% dplyr::bind_rows(te2_one_trait)
foo$pred <- predict(fit, newdata = foo)
foo$fold_left_out <- fold_num
result <- foo %>%
dplyr::mutate(beta1_hat = coef(fit)[2],
beta0_hat = coef(fit)[1],
se_beta1_hat = summary(fit)$coefficients[2, 2],
se_beta0_hat = summary(fit)$coefficients[1, 2]
)
# save predictions
preds[[fold_num]] <- result
te_geno_mat <- cbind(1, te2_one_trait$geno)
tr_geno_mat <- cbind(1, train$geno)
vars[[fold_num]] <- diag(te_geno_mat %*% solve(t(tr_geno_mat) %*% tr_geno_mat) %*% t(te_geno_mat))
}
vars_list[[trait_num]] <- vars
# assemble predicted values
# get absolute residuals
preds_training <- preds %>%
dplyr::bind_rows() %>%
dplyr::filter(!is.na(fold)) %>% # keep only training data
dplyr::mutate(absolute_residual = abs(pheno - pred)) %>%
dplyr::select( - fold_left_out)
preds_test <- preds %>%
dplyr::bind_rows() %>%
dplyr::filter(is.na(fold))
# get indexes
plus_index <- ceiling((1 - alpha) * (nrow(preds_training) + 1))
minus_index <- floor(alpha * (nrow(preds_training) + 1))
# go one by one through test set (testing2)
test_list <- list()
for (i in 1:nrow(testing2)){
tt <- testing2[i, ]
pt2 <- preds_test %>%
dplyr::filter(id == tt$id) %>% # our only use of tt
dplyr::rename_with(function(x)paste0("test_", x))
# pt2 contains the five predicted values for a single test subject
nrow(pt2) # 5
preds_all <- preds_training %>%
dplyr::left_join(pt2, by = c("fold" = "test_fold_left_out")) %>%
dplyr::mutate(test_fitted_plus_absolute_residual = test_pred + absolute_residual,
test_fitted_minus_absolute_residual = test_pred - absolute_residual)
uu <- sort(preds_all$test_fitted_plus_absolute_residual)[plus_index]
ll <- sort(preds_all$test_fitted_minus_absolute_residual)[minus_index]
# make a tibble with exactly one row
test_list[[i]] <- preds_all %>%
dplyr::select(test_id, test_geno, test_pheno, test_beta1_hat, fold) %>%
dplyr::mutate(lower = ll, upper = uu) %>%
dplyr::distinct() %>%
tidyr::pivot_wider(names_from = fold,
values_from = test_beta1_hat,
names_prefix = "beta1_hat_fold_"
)
}
test_tib <- test_list %>%
dplyr::bind_rows() %>%
dplyr::mutate(in_interval = test_pheno >= lower & test_pheno <= upper) %>%
dplyr::mutate(interval_width = upper - lower) %>%
dplyr::mutate(training_set_size = n_per_fold * n_folds,
trait_num = trait_num)
out[[trait_num]] <- test_tib
}
tl[[as.character(n_per_fold * n_folds)]] <- out
vars_list_list[[as.character(n_per_fold * n_folds)]] <- vars_list
}
res_list[[as.character(n_folds)]] <- tl
vars_list_list_list[[as.character(n_folds)]] <- vars_list_list
tictoc::toc() # timing
}
saveRDS(res_list, "res_list.rds")
saveRDS(vars_list_list_list, "vars_list_list_list.rds")#test_tib_thin <- test_tib %>%
# dplyr::select(test_id, test_geno)
# need to accommodate varying number of beta1_hat_fold_* columns
tt_intermediate <- tl %>%
dplyr::bind_rows(.id = "id")
results <- tt_intermediate %>%
dplyr::group_by(training_set_size, trait_num) %>%
dplyr::summarise(mean_interval_width = mean(interval_width),
sd_interval_width = sd(interval_width),
mean_in_interval = mean(in_interval),
sd_in_interval = sd(in_interval),
#beta1_hat_fold_1 = mean(beta1_hat_fold_1),
#beta1_hat_fold_2 = mean(beta1_hat_fold_2),
#beta1_hat_fold_3 = mean(beta1_hat_fold_3),
#beta1_hat_fold_4 = mean(beta1_hat_fold_4),
#beta1_hat_fold_5 = mean(beta1_hat_fold_5),
median_interval_width = median(interval_width)
) %>%
dplyr::ungroup() %>%
# dplyr::mutate(mean_b1 = purrr::pmap_dbl(.l = list(beta1_hat_fold_1,
# beta1_hat_fold_2,
# beta1_hat_fold_3,
# beta1_hat_fold_4,
# beta1_hat_fold_5),
# .f = function(x, y, z, w, v) mean(c(x, y, z, w, v))),
# sd_b1 = purrr::pmap_dbl(.l = list(beta1_hat_fold_1,
# beta1_hat_fold_2,
# beta1_hat_fold_3,
# beta1_hat_fold_4,
# beta1_hat_fold_5),
# .f = function(x, y, z, w, v) sd(c(x, y, z, w, v)))
# )
results %>%
knitr::kable() %>%
print()library(ggplot2)
tt_intermediate %>%
ggplot(aes(y = interval_width, colour = as.factor(training_set_size), x = as.factor(trait_num))) +
geom_boxplot()
#ggsave(here::here("figures", "interval_width_boxplot.png"), width = 10, height = 10)p1 <- results %>%
ggplot(aes(x = mean_b1, y = median_interval_width, colour = as.factor(training_set_size), replicate_num = trait_num)) +
geom_point()
plotly::ggplotly(p1, tooltip = c("x", "y", "colour", "replicate_num"))sessioninfo::session_info()─ Session info ───────────────────────────────────────────────────────────────
setting value
version R version 4.3.0 (2023-04-21)
os Ubuntu 18.04.6 LTS
system x86_64, linux-gnu
ui X11
language en_US:
collate en_US.UTF-8
ctype en_US.UTF-8
tz America/Detroit
date 2023-05-15
pandoc 1.19.2.4 @ /usr/bin/ (via rmarkdown)
─ Packages ───────────────────────────────────────────────────────────────────
package * version date (UTC) lib source
cli 3.6.1 2023-03-23 [1] CRAN (R 4.2.3)
colorspace 2.1-0 2023-01-23 [1] CRAN (R 4.2.2)
crosstalk 1.2.0 2021-11-04 [1] CRAN (R 4.1.2)
data.table 1.14.8 2023-02-17 [1] CRAN (R 4.2.2)
digest 0.6.31 2022-12-11 [1] CRAN (R 4.2.2)
dplyr 1.1.1 2023-03-22 [1] CRAN (R 4.2.3)
ellipsis 0.3.2 2021-04-29 [2] CRAN (R 4.2.1)
evaluate 0.20 2023-01-17 [1] CRAN (R 4.2.2)
fansi 1.0.4 2023-01-22 [1] CRAN (R 4.2.2)
farver 2.1.1 2022-07-06 [1] CRAN (R 4.2.3)
fastmap 1.1.1 2023-02-24 [1] CRAN (R 4.2.3)
generics 0.1.3 2022-07-05 [1] CRAN (R 4.2.3)
ggplot2 * 3.4.2 2023-04-03 [1] CRAN (R 4.2.3)
glue 1.6.2 2022-02-24 [1] CRAN (R 4.2.0)
gtable 0.3.3 2023-03-21 [1] CRAN (R 4.2.3)
here 1.0.1 2020-12-13 [2] CRAN (R 4.1.1)
htmltools 0.5.5 2023-03-23 [1] CRAN (R 4.2.3)
htmlwidgets 1.6.2 2023-03-17 [1] CRAN (R 4.2.2)
httr 1.4.5 2023-02-24 [1] CRAN (R 4.2.3)
jsonlite 1.8.4 2022-12-06 [1] CRAN (R 4.2.3)
knitr 1.42 2023-01-25 [1] CRAN (R 4.2.3)
labeling 0.4.2 2020-10-20 [2] CRAN (R 4.0.3)
lattice 0.21-8 2023-04-05 [1] CRAN (R 4.2.3)
lazyeval 0.2.2 2019-03-15 [2] CRAN (R 4.0.3)
lifecycle 1.0.3 2022-10-07 [1] CRAN (R 4.2.2)
magrittr * 2.0.3 2022-03-30 [1] CRAN (R 4.2.0)
Matrix 1.5-4 2023-04-04 [1] CRAN (R 4.2.3)
munsell 0.5.0 2018-06-12 [2] CRAN (R 4.0.3)
pillar 1.9.0 2023-03-22 [1] CRAN (R 4.2.3)
pkgconfig 2.0.3 2019-09-22 [2] CRAN (R 4.0.3)
plotly 4.10.1 2022-11-07 [1] CRAN (R 4.2.2)
png 0.1-8 2022-11-29 [1] CRAN (R 4.2.3)
purrr 1.0.1 2023-01-10 [1] CRAN (R 4.2.2)
R6 2.5.1 2021-08-19 [2] CRAN (R 4.1.1)
rappdirs 0.3.3 2021-01-31 [2] CRAN (R 4.0.3)
Rcpp 1.0.10 2023-01-22 [1] CRAN (R 4.2.2)
reticulate * 1.28 2023-01-27 [1] CRAN (R 4.3.0)
rlang 1.1.0 2023-03-14 [1] CRAN (R 4.2.2)
rmarkdown 2.21 2023-03-26 [1] CRAN (R 4.2.3)
rprojroot 2.0.3 2022-04-02 [2] CRAN (R 4.2.0)
scales 1.2.1 2022-08-20 [1] CRAN (R 4.2.3)
sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.1.2)
stringi 1.7.12 2023-01-11 [1] CRAN (R 4.2.2)
stringr 1.5.0 2022-12-02 [1] CRAN (R 4.2.3)
tibble 3.2.1 2023-03-20 [1] CRAN (R 4.2.3)
tidyr 1.3.0 2023-01-24 [1] CRAN (R 4.2.3)
tidyselect 1.2.0 2022-10-10 [1] CRAN (R 4.2.2)
utf8 1.2.3 2023-01-31 [1] CRAN (R 4.2.3)
vctrs 0.6.1 2023-03-22 [1] CRAN (R 4.2.3)
viridisLite 0.4.1 2022-08-22 [1] CRAN (R 4.2.3)
withr 2.5.0 2022-03-03 [1] CRAN (R 4.2.0)
xfun 0.38 2023-03-24 [1] CRAN (R 4.2.3)
yaml 2.3.7 2023-01-23 [1] CRAN (R 4.2.3)
[1] /net/mulan/home/fredboe/R/x86_64-pc-linux-gnu-library/4.0
[2] /net/mario/cluster/lib/R/site-library-bionic-40
[3] /usr/local/lib/R/site-library
[4] /usr/lib/R/site-library
[5] /usr/lib/R/library
─ Python configuration ───────────────────────────────────────────────────────
python: /net/mulan/home/fredboe/miniconda3/envs/barber1/bin/python3
libpython: /net/mulan/home/fredboe/miniconda3/envs/barber1/lib/libpython3.10.so
pythonhome: /net/mulan/home/fredboe/miniconda3/envs/barber1:/net/mulan/home/fredboe/miniconda3/envs/barber1
version: 3.10.9 (main, Mar 8 2023, 10:47:38) [GCC 11.2.0]
numpy: /net/mulan/home/fredboe/miniconda3/envs/barber1/lib/python3.10/site-packages/numpy
numpy_version: 1.23.5
python versions found:
/net/mulan/home/fredboe/miniconda3/envs/barber1/bin/python3
/net/mulan/home/fredboe/miniconda3/envs/barber1/bin/python
/net/mulan/home/fredboe/miniconda3/bin/python
/net/mulan/home/fredboe/miniconda3/envs/ldsc/bin/python
──────────────────────────────────────────────────────────────────────────────
# git commit info
gr <- git2r::repository(here::here()) %>%
git2r::commits()
gr[[1]] [425c373] 2023-05-15: feat: Expanded python & R code to save python output df as a rds file